ggplotly() and plot_ly()
plot_geo()

We will work with two Starbucks datasets: one on store locations (global) and one with nutritional data for their food and drink items. We will also do some text analysis of the menu items.
# Load the inputs. Column-type messages are silenced for every file.
# Starbucks store locations (global).
sb_locs <- read_csv(
  file = "starbucks-locations.csv",
  show_col_types = FALSE
)
# Nutrition facts for Starbucks menu items.
sb_nutr <- read_csv(
  file = "starbucks-menu-nutrition.csv",
  show_col_types = FALSE
)
# State populations (joined below on its `state` column).
usa_pop <- read_csv(
  file = "us_state_pop.csv",
  show_col_types = FALSE
)
# Lookup between state names and abbreviations.
usa_states <- read_csv(
  file = "states.csv",
  show_col_types = FALSE
)
# Set eval=FALSE
# View() opens an interactive data viewer, so only call it from an interactive
# session; this keeps the chunk harmless if it is ever evaluated while
# rendering the document non-interactively.
if (interactive()) {
  View(sb_locs)
  View(sb_nutr)
  View(usa_pop)
  View(usa_states)
}
# Keep only the stores located in the United States.
sb_usa <- filter(sb_locs, Country == "US")

# One row per state: the abbreviation (renamed from `State/Province`) and the
# number of stores recorded for it.
sb_locs_state <- sb_usa |>
  count(state_abbr = `State/Province`, name = "n_stores")

# Attach full state names via the abbreviation lookup; they are needed as the
# key for the population join that follows.
usa_pop_abbr <- sb_locs_state |>
  full_join(usa_states, by = join_by(state_abbr == Abbreviation))

# Add population. Full joins keep rows that are unmatched on either side, so
# missing values can appear in the result.
sb_locs_state <- usa_pop_abbr |>
  full_join(usa_pop, by = join_by(State == state))

summary(sb_locs_state)
## state_abbr n_stores State population
## Length:55 Min. : 8.0 Length:55 Min. : 56882
## Class :character 1st Qu.: 56.5 Class :character 1st Qu.: 1344331
## Mode :character Median : 123.0 Mode :character Median : 3751351
## Mean : 266.8 Mean : 5677621
## 3rd Qu.: 332.0 3rd Qu.: 6515716
## Max. :2821.0 Max. :37253956
## NA's :4
ggplotly for EDA

Answer the following questions:
Is the number of Starbucks stores proportional to the population of a state? (scatterplot)
Is the caloric distribution of Starbucks menu items different for drinks and food? (histogram)
What are the top 20 words in Starbucks menu items? (bar plot)
# Scatterplot of store count (x) against state population (y), one colour per
# state abbreviation, converted to an interactive plotly widget.
p1 <- ggplot(
  data = sb_locs_state,
  mapping = aes(x = n_stores, y = population, color = state_abbr)
) +
  geom_point()
ggplotly(p1)
# Overlaid (not stacked) semi-transparent histograms of calories, one per menu
# category, converted to an interactive plotly widget.
p2 <- ggplot(
  data = sb_nutr,
  mapping = aes(x = Calories, fill = Category)
) +
  geom_histogram(position = "identity", alpha = 0.5)
ggplotly(p2)
# Bar plot of the 20 most frequent words found in menu item names.
p3 <- sb_nutr |>
  # Split each item name into one lower-cased word per row.
  unnest_tokens(word, Item) |>
  count(word, sort = TRUE) |>
  head(20) |>
  # Turn `word` into a factor ordered by its count. Without this, ggplot lays
  # the bars out alphabetically and the "top 20" ranking cannot be read off
  # the chart; after coord_flip() the most frequent word ends up on top.
  mutate(word = reorder(word, n)) |>
  ggplot(aes(x = word, y = n, fill = word)) +
  geom_col() +
  coord_flip()
# The fill legend only repeats the axis labels, so hide it.
ggplotly(p3) |> layout(showlegend = FALSE)
# plot_ly()
# plot_ly() representing the relationship between calories and carbs
# Marker scatter of carbohydrates against calories, coloured by category.
plot_ly(
  data = sb_nutr,
  type = "scatter",
  mode = "markers",
  x = ~Calories,
  y = ~`Carb. (g)`,
  color = ~Category
)
# Text fragment from the write-up: hovermode = "compare"
# The ten most frequent words across menu item names, most frequent first.
topwords <- sb_nutr |>
  unnest_tokens(output = word, input = Item) |>
  count(word, sort = TRUE) |>
  slice_head(n = 10)
# Carbohydrates vs. calories for menu items whose name contains one of the top
# words. An item appears once for every top word found in its name.
sb_nutr |>
  # drop = FALSE keeps the original Item column next to the tokenised `word`,
  # so the hover label can show the real item name instead of a single token
  # mislabelled as "Item".
  unnest_tokens(word, Item, drop = FALSE) |>
  filter(word %in% topwords$word) |>
  plot_ly(
    x = ~Calories,
    y = ~`Carb. (g)`,
    type = "scatter",
    mode = "markers",
    color = ~Category,
    # Show only the custom text built below.
    hoverinfo = "text",
    hovertext = ~ paste0(
      "Item: ", Item, "<br>",
      "Word: ", word, "<br>",
      "Calories: ", Calories, "<br>",
      "Carb. (g): ", `Carb. (g)`
    )
  ) |>
  layout(
    xaxis = list(title = "Calories"),
    yaxis = list(title = "Carbohydrates (g)"),
    title = "Carbohydrates vs. Calories for the Top 10 Item Words",
    hovermode = "compare"
  )
# plot_ly Boxplots
# One row per (menu item, word) pair, restricted to the words in `topwords`.
filtered_data <- filter(
  unnest_tokens(sb_nutr, word, Item),
  word %in% topwords[["word"]]
)
# Grouped box plots: for each top word, one box per nutritional variable.
plot_ly(
  data = filtered_data,
  x = ~word,
  y = ~Calories,
  type = "box",
  name = "Calories"
) |>
  add_boxplot(y = ~`Fat (g)`, name = "Fat") |>
  add_boxplot(y = ~`Carb. (g)`, name = "Carbohydrates") |>
  add_boxplot(y = ~`Fiber (g)`, name = "Fiber") |>
  add_boxplot(y = ~`Protein (g)`, name = "Protein") |>
  # boxmode = "group" places the boxes side by side within each word.
  layout(
    boxmode = "group",
    xaxis = list(title = "Item"),
    yaxis = list(title = "Nutritional Variables")
  )
# 3D marker scatter of calories, carbohydrates and protein, coloured by word.
plot_ly(
  data = filtered_data,
  type = "scatter3d",
  mode = "markers",
  x = ~Calories,
  y = ~`Carb. (g)`,
  z = ~`Protein (g)`,
  color = ~word
)
# plot_ly Map
# Geo layout passed to both choropleths below: U.S. scope, Albers USA
# projection, lakes drawn in steel blue.
set_map_details <- list(
  scope = "usa",
  projection = list(type = "albers usa"),
  showlakes = TRUE,
  lakecolor = toRGB("steelblue")
)

# Make sure both maps are on the same color scale
# (this value is not referenced by any of the code visible in this section).
shade_limit <- 125

# Per-state hover label listing store count, state name and population,
# separated by HTML line breaks.
sb_locs_state <- sb_locs_state |>
  mutate(
    hover = paste(
      "Number of Starbucks: ", n_stores, "<br>",
      "State: ", State, "<br>",
      "Population: ", population
    )
  )
# Create the maps: two state-level choropleths that share the hover text.
# Map 1: number of Starbucks stores per state, shaded in reds.
map1 <- plot_geo(sb_locs_state, locationmode = "USA-states") |>
  add_trace(
    z = ~n_stores,
    text = ~hover,
    locations = ~state_abbr,
    color = ~n_stores,
    colors = "Reds"
  ) |>
  layout(title = "Starbucks Stores by State", geo = set_map_details)

# Map 2: population per state, shaded in greens.
map2 <- plot_geo(sb_locs_state, locationmode = "USA-states") |>
  add_trace(
    z = ~population,
    text = ~hover,
    locations = ~state_abbr,
    color = ~population,
    colors = "Greens"
  ) |>
  layout(title = "Population by State", geo = set_map_details)

# subplot() merges the layouts of its inputs, and options from later plots
# override earlier ones, so only one of the two per-map titles would survive.
# Set an explicit title that describes both panels.
subplot(map1, map2) |>
  layout(title = "Starbucks Stores (left) and Population (right) by State")
We note that there is some association between the number of Starbucks stores and population. For example, California has the largest population, and it indeed has the most Starbucks stores. Texas, Washington, New York, and Florida have relatively large populations, and they also have more Starbucks stores than most other states.